In this script I define PURA binding sites from the IH-AB FLAG-PURA iCLIP data set. The following steps were performed in this order:
Definition of Binding sites and filtering
PureCLIP peak calling (on merged sample1+2 sample3+4)
Global filter of pureclip sites (>5 cl events on pureclip site)
Local (genewise) filter of pureclip sites ( > 75% percentile of gene)
Binding site definition (5nt, Matrix approach, discard BS where center is not max pureclip, has less then 2 sites with crosslinks)
Reproducibility filtering (soft boundary: 0.05% percentile of cl per binding site per sample, repro in min 3 of 4 samples)
Comparison with old BS definition pipeline
Analysis of biniding behaviour
Analysis of bound gene types
Analysis of bound gene regions
2 Input
Show code
raw_path <-"/Users/melinaklostermann/Documents/projects/PURA/01_raw_data/oe-imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/"# crosslink sites in bw format###########################################all 4 sample merged by Ankebw_all_plus_path <-paste0(raw_path, "merged/bw/imb_koenig_2019_11_allsamples.v2uniqMD.duprm.plus.bw")bw_all_minus_path <-paste0(raw_path, "merged/bw/imb_koenig_2019_11_allsamples.v2uniqMD.duprm.minus.bw")# single samplesbw_1_plus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample1.v2uniqMD.duprm.plus.bw")bw_1_minus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample1.v2uniqMD.duprm.minus.bw")bw_2_plus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample2.v2uniqMD.duprm.plus.bw")bw_2_minus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample2.v2uniqMD.duprm.minus.bw")bw_3_plus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample3.v2uniqMD.duprm.plus.bw")bw_3_minus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample3.v2uniqMD.duprm.minus.bw")bw_4_plus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample4.v2uniqMD.duprm.plus.bw")bw_4_minus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample4.v2uniqMD.duprm.minus.bw")#bw_all_samples_path <-"bw_4samples.RData"bw_merges_path <-"bw_merges.RData"# pureclip calls ####################################(obtained by running pureclip on pseudo samples 1u2 and 3u4 see below)pureclip_path <-"/Users/melinaklostermann/Documents/projects/PURA/02_R_new_pip/01-BS_def/01-BS_def_oe/PureCLIP"# gencode annotation v31################################## this is the filtered annotation as used in molitor et alannotation <-readRDS("/Users/melinaklostermann/Documents/projects/PURA/Molitor-et-al-2022/annotation.rds")anno_txdb <-makeTxDbFromGRanges(annotation)output_path <-"/Users/melinaklostermann/Documents/projects/Thesis/Thesis_code/PURA/01_binding_site_definition_oePURA/"
3 Look on raw bws
sample1 is the strongest and sample2 the weakest sample. Still all samples have a very nice depth in signal.
sample
CLsites
CLevents
CLeventsPerSite
sample 1 +
4580865
10723300
2.340890
sample 1 -
4438230
10599106
2.388138
sample 2 +
3551704
7293909
2.053637
sample 2 -
3451542
7225468
2.093403
sample 3 +
3351243
7978970
2.380899
sample 3 -
3257938
7901927
2.425438
sample 4 +
4196250
9640822
2.297485
sample 4 -
4068269
9554433
2.348525
all samples +
9623431
35637001
3.703149
all samples -
9229013
35280934
3.822828
Distribution of CL sites and events in the 4 samples
4 PureCLIP peak calling
PureCLIP was performed on the bam files of 2 pseudo samples (as it can only use 2 samples at the moment): A merge of sample 1 and 2 and a merge of sample 3 and 4. The bam files of all 4 samples were obtained from Ankes pipeline. The files were merged with samtools merge. This is the bash code that was executed on the server:
Show code
# Mergemelina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/bam/all/DR$conda activate(base)melina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/bam/all/DR$samtools merge sample1u2.v2uniqMD.bam imb_koenig_2019_11_sample1.v2uniqMD.duprm.bamimb_koenig_2019_11_sample2.v2uniqMD.duprm.bam(base)melina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/bam/all/DR$samtools merge sample3u4.v2uniqMD.bam imb_koenig_2019_11_sample3.v2uniqMD.duprm.bamimb_koenig_2019_11_sample4.v2uniqMD.duprm.bam(base)melina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/bam/all/DR$samtools index sample1u2.v2uniqMD.bam (base)melina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/bam/all/DR$samtools index sample3u4.v2uniqMD.bam <p># Copy to share projectmelina@gateway:/share/project/zarnack/melina/pureclip_pura_oe_pip2$cp ~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/bam/all/DR/sample* .# Get genome annotationmelina@gateway:/share/project/zarnack/melina/pureclip_pura_oe_pip2$wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_31/GRCh38.p12.genome.fa.gzmelina@gateway:/share/project/zarnack/melina/pureclip_pura_oe_pip2$gunzip GRCh38.p12.genome.fa.gz#PureCLIP Script#!/bin/bash <p>#SBATCH --partition=all <p>#SBATCH --cpus-per-task=6 <p>#SBATCH --mem-per-cpu=32gb <p>#SBATCH --job-name="PureCLIP_PURA_oe_pip2" <p>#SBATCH --array=1 <p>#SBATCH --output=pureclip_pura_o2_pip2.out # Standard output and error log <p>#SBATCH --error=pureclip_pura_oe_pip2.err <p>#SBATCH --mail-user=melinaklostermann@googlemail.com <p>#SBATCH --mail-type=ALL <p>echo This is task \$SLURM_ARRAY_TASK_IDconda activatepureclip-i /share/project/zarnack/melina/pureclip_pura_oe_pip2/sample1u2.v2uniqMD.bam -bai /share/project/zarnack/melina/pureclip_pura_oe_pip2/sample1u2.v2uniqMD.bam.bai -i /share/project/zarnack/melina/pureclip_pura_oe_pip2/sample3u4.v2uniqMD.bam-bai /share/project/zarnack/melina/pureclip_pura_oe_pip2/sample3u4.v2uniqMD.bam.bai -g /share/project/zarnack/melina/pureclip_pura_oe_pip2/GRCh38.p12.genome.fa -o peakcalling_pura_oe_pip2_sites.bed -or peakcalling_pura_oe_pip2_regions.bed -nt 10#Run job(base)melina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/PureCLIP_PURA_oe_pip2$sbatch jobscript_Pureclip_PURA_oe_pip2.sh <p>(base)melina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/PureCLIP_PURA_oe_pip2$squeue
PureCLIP calls are loaded into R. The metadata of the resulting GRAnges object is cleaned up, as the columns get wrong names and empty columns are intruduced, when importing .bed files as GRanges objects (not shown).
Gene annotations are megred to PureCLIP sites. For this the annotation is filtered for overlapping genes. Of two overlaping genes the longer is kept.
Show code
annotation_genes <- annotation[annotation$type=="gene"] # sortiere anno nach laenge, damit bei uberlappenden genen das laengere (erste) behalten wirdannotation_genes <-sort(annotation_genes, by =~ width + seqnames + start)# remove ueberlappende geneannotation_genes$dup <-duplicated.GenomicRanges(annotation_genes)annotation_genes_dup_rem <- annotation_genes[annotation_genes$dup == F]# get genes that belong to pureclip sitepureclip_sites_anno <-subsetByOverlaps(pureclip_sites, annotation_genes_dup_rem)# if start of gene 1 = end of gene 2 keep firstidx <-findOverlaps(pureclip_sites_anno, annotation_genes_dup_rem, ignore.strand =FALSE, minoverlap = 1L, select ="first")# remove pureclip sites that are not annotated on a genepureclip_sites_anno[isNA(idx)] <-NULLidx <-na.omit(idx)# combine metadata of gene anno und pureclip siteselementMetadata(pureclip_sites_anno) <-cbind(elementMetadata(pureclip_sites_anno),elementMetadata(annotation_genes_dup_rem[idx]))# remove empty columns (empty for genes)pureclip_sites_anno$transcript_id <-NULLpureclip_sites_anno$transcript_type <-NULLpureclip_sites_anno$transcript_name <-NULLpureclip_sites_anno$transcript_support_level <-NULLpureclip_sites_anno$havana_transcript <-NULLpureclip_sites_anno$hgnc_id <-NULLpureclip_sites_anno$exon_number <-NULLpureclip_sites_anno$exon_id <-NULLpureclip_sites_anno$protein_id <-NULLpureclip_sites_anno$ccdsid <-NULLpureclip_sites_anno$ont <-NULLsave(pureclip_sites_anno, file=paste0(output_path,"peakcalling_pura_oe_sites_with_geneanno.RData"))save(annotation_genes_dup_rem, file=paste0(output_path,"annotation.RData"))
5.1 How many Pureclip sites are there per gene?
A problem for this normalisation are obviously genes with very few or only one pureclip call. This is the distribution of PureCLIP calls per gene:
Of all bound genes 98.45% contain more than 5 PureCLIP calls. Therefore, the calculation of the SOB on PureCLIP sites should work fine.
6 iCLIP2: loads of signal – Global and local (genewide) filter
As this data set has a very high signal depth (which is nice), also potential background signals are amplified. In order to focus on strong binding events, weaker PureCLIP sites are filtered out. A global and a local (genewide) filter are applied. The idea of the gene wide filter is that genes which were less abundant within the samples, will also have a weaker signal. A strong global filter would therefore not only filter strong binding events, but might introduce a bias towards strongly expressed genes. In the local filter one can filter more stringendly without introducing this bias. However, one has to keep in mind that the PURA homodimer in theory has 6 RNA binding domains, and its binding to a RNA might also depent on two or more adjecant binding sites.
6.1 Global filter
To choose a global filter, we can look at the distribution of the mean cl events on pureclip sites per gene and the quantiles of cl events on all pureclip sites.
As the lowest mean cl events on pureclip sites per gene are 4 and 5, a filter above 5 will already remove no background signal on these genes. As the data is from 4 replicates 5 cl events, could mean 3 samples with 1 cl and one sample with 2 cl (other distributions are also possible), which should be a minimum to trust a peak. If a filter of 5 cl events per pureclip site is applied 0 % of the pureclip sites are discarded.
6.2 Genewise filter
How many BS per gene make sense?
Pura dimer has in theory 6 RNA Binding sites, could also bind several times the same RNA
The advantage of the genewise filter is, that it will not introduce a bias towards strongly expressed genes. However, a to stringend genewise filter might not account for more complex binding patterns. As PURA might bind at more than one site at the same time and a binding site is more than 1 nt long, the genewise filter should allow several PureCLIp sites per gene.
On the other hand for the intended following analysis in theory one BS per gene would be sufficient.
To choose a suitable percentile for filtering, percentiles in steps of 5 are calculated for all genes. These can then be plotted in a boxplot to see their distribution:
Show code
##################filter pureclip hits local################make purr map with different percentilesp <-seq(from=0, to=1, by=0.05)p_names <- purrr::map_chr(p, ~paste0(.x*100, "%"))#write a function with purrr to calulate the different percentilesp_funs <- purrr::map(p, ~purrr::partial(quantile, probs = .x, na.rm =TRUE)) %>% purrr::set_names(nm = p_names)#use dyplr to calculate different percentiles for each genequant_genewise <- pureclip_sites_anno %>%as.data.frame(.) %>% dplyr::group_by(gene_id) %>% dplyr::summarize_at("score", funs(!!!p_funs))%>% as.data.framehead(quant_genewise)
In the end the 75% lower percentile per gene is filtered out. # Binding site definition
Binding sites are obtained, by merging all pureclip sites with a gap <= 8, and then splitting up the large BS obtained from the merging. The large binding sites are split up using a rle matrix. This approach requires afterwards to throw out Binding sites (BS) whose center is no Pureclip site or does not have the maximun Pureclip score.
Show code
#################################### Binding site Definition####################################Parameters:#pureclip - Granges containing PureCLIP sites# bw_plus, bw_minus - path to bw files# windowsize - size thet Bindingsites should have (should be uneven)Define_Binding_Sites <-function(pureclip, bw_plus, bw_minus, windowsize, out){# Merge Gaps < 8 from single pureclip sitespureclip =reduce(pureclip, min.gapwidth =8)#remove sites with 1 or 2 nt lengthpureclip = pureclip[width(pureclip) >2] bw_plus =import.bw(bw_plus, as="Rle") bw_minus =import.bw(bw_minus, as="Rle") final.peaks.plus.gr <-GRanges() final.peaks.minus.gr <-GRanges()#Initialize the remaining PureCLIP CL regions to check for peaks remaining.regions.plus.gr <-subset(pureclip, strand =="+") remaining.regions.minus.gr <-subset(pureclip, strand =="-") window.radius <- (windowsize-1)/2while(TRUE){#No regions left to check for peaksif (length(remaining.regions.plus.gr) ==0&length(remaining.regions.minus.gr) ==0){break }if (length(remaining.regions.plus.gr) !=0 ){#Get the raw CL counts in the remaining PureCLIP CL regions# returns rle list of all regions and turns it into matrix raw.remaining.PureCLIP.CL.regions.plus.m <-as.matrix(bw_plus[remaining.regions.plus.gr])#Identify the center of the PureCLIP CL regions (Position with max counts)# and store its indice raw.remaining.PureCLIP.CL.regions.plus.m[is.na(raw.remaining.PureCLIP.CL.regions.plus.m)] <--Inf# set Na to -infinite max.pos.indice.plus <-max.col(raw.remaining.PureCLIP.CL.regions.plus.m, ties.method ="first")#Create a peak region of xnt that is centered to the max position peaks.plus.gr <- remaining.regions.plus.grstart(peaks.plus.gr) <-start(peaks.plus.gr) + max.pos.indice.plus -1end(peaks.plus.gr) <-start(peaks.plus.gr) peaks.plus.gr <- peaks.plus.gr + window.radius#Store the new peaks final.peaks.plus.gr <-c(final.peaks.plus.gr, peaks.plus.gr)#Remove the peaks from the CL regions to search for additional peaks#Excise additionally x nucleotides up and downstream peaks.plus.grl <-as(peaks.plus.gr+window.radius, "GRangesList") remaining.regions.plus.gr <-unlist(psetdiff(remaining.regions.plus.gr, peaks.plus.grl)) }if (length(remaining.regions.minus.gr) !=0 ){#Get the raw CL counts in the remaining PureCLIP CL regions# returns rle list of all regions and turns it into matrix raw.remaining.PureCLIP.CL.regions.minus.m <-as.matrix( bw_minus[remaining.regions.minus.gr])#Identify the center of the PureCLIP CL regions (Position with max counts) #and store its indice raw.remaining.PureCLIP.CL.regions.minus.m[is.na(raw.remaining.PureCLIP.CL.regions.minus.m)] <--Inf max.pos.indice.minus <-max.col(raw.remaining.PureCLIP.CL.regions.minus.m, ties.method ="last")#Create a peak region of xnt that is centered to the max position peaks.minus.gr <- remaining.regions.minus.grstart(peaks.minus.gr) <-start(peaks.minus.gr) + max.pos.indice.minus -1end(peaks.minus.gr) <-start(peaks.minus.gr) peaks.minus.gr <- peaks.minus.gr + window.radius#Store the new peaks final.peaks.minus.gr <-c(final.peaks.minus.gr, peaks.minus.gr)#Remove the peaks from the CL regions to search for additional peaks#Excise additionally x nucleotides up and downstream peaks.minus.grl <-as(peaks.minus.gr+window.radius, "GRangesList") remaining.regions.minus.gr <-unlist(psetdiff(remaining.regions.minus.gr, peaks.minus.grl)) } }export(final.peaks.plus.gr, con=paste(out,"_plus.bed", sep =""), format ="bed")export(final.peaks.minus.gr, con=paste(out,"_minus.bed", sep =""), format ="bed")save(final.peaks.minus.gr, file=paste(out,"_plus.RData", sep =""))save(final.peaks.plus.gr, file=paste(out,"_minus.RData", sep ="")) returnlist <-list(peaks.minus = final.peaks.minus.gr, peaks.plus = final.peaks.plus.gr)return(returnlist)}pureclip_sites_anno <-makeGRangesFromDataFrame(pureclip_sites_anno,keep.extra.columns =TRUE)binding_sites <-Define_Binding_Sites(pureclip = pureclip_sites_anno, bw_plus = bw_all_plus_path, bw_minus = bw_all_minus_path,windowsize =5, out =paste0(output_path,"Binding_site_windows_5nt" ))
6.3 Throw out BS where not pureclip site or not max pureclip site
Show code
load(paste0(output_path,"Binding_site_windows_5nt_plus.RData"))load(paste0(output_path,"Binding_site_windows_5nt_minus.RData"))############################# Keep only BS with PureCLIP center############################# get all BSbinding_sites <-c(final.peaks.minus.gr, final.peaks.plus.gr)#get centersBS_centers <- binding_sites -2#keep only overlaps with pureclip sitespureclip_sites_anno <-makeGRangesFromDataFrame(pureclip_sites_anno, keep.extra.columns =TRUE)binding_sites_center_PS <- binding_sites[queryHits(findOverlaps( BS_centers, pureclip_sites_anno))]############################ Keep only BS with max PureCLIP site at center########################### get bw rlesbw_plus_rle <-import.bw(bw_all_plus_path, as="Rle")bw_minus_rle <-import.bw(bw_all_minus_path, as="Rle")# split BS by strandbinding_sites_center_PS_plus <- binding_sites_center_PS[strand(binding_sites_center_PS)=="+"]binding_sites_center_PS_minus <- binding_sites_center_PS[strand(binding_sites_center_PS)=="-"]# make matrix of BSbinding_sites_center_PS_plus_m <-as.matrix(bw_plus_rle[binding_sites_center_PS_plus])binding_sites_center_PS_minus_m <-as.matrix(bw_minus_rle[binding_sites_center_PS_minus])# calc max for each BS (one BS is one row in the matrix)max_BS_plus <-apply(binding_sites_center_PS_plus_m,1,max)max_BS_minus <-apply(binding_sites_center_PS_minus_m,1,max)# subset for center = maxbinding_sites_center_PSmax_plus <- binding_sites_center_PS_plus[ max_BS_plus == binding_sites_center_PS_plus_m[,3]] binding_sites_center_PSmax_minus <- binding_sites_center_PS_minus[ max_BS_minus == binding_sites_center_PS_minus_m[,3]]############################ Keep only BS with at least 2 crosslink sites############################binding_sites_center_PSmax_plus_m <-as.matrix(bw_plus_rle[binding_sites_center_PSmax_plus])binding_sites_center_PSmax_minus_m <-as.matrix(bw_minus_rle[binding_sites_center_PSmax_minus])crosslink_sites_plus <-apply(binding_sites_center_PSmax_plus_m, 1, function(x) 5-sum(x ==0)) crosslink_sites_minus <-apply(binding_sites_center_PSmax_minus_m, 1, function(x) 5-sum(x ==0)) binding_sites_center_PSmax_plus_2cl <- binding_sites_center_PSmax_plus[crosslink_sites_plus >1]binding_sites_center_PSmax_minus_2cl <- binding_sites_center_PSmax_minus[crosslink_sites_minus >1]############################ BS definition steps table##########################BS_steps_list <-list(final.peaks.plus.gr, final.peaks.minus.gr, binding_sites_center_PS_plus, binding_sites_center_PS_minus, binding_sites_center_PSmax_plus, binding_sites_center_PSmax_minus, binding_sites_center_PSmax_plus_2cl, binding_sites_center_PSmax_minus_2cl )n_BS_steps <-sapply(BS_steps_list, NROW)names_BS_steps <-c("definded BS plus", "defined BS minus", "PureCLIP center plus", "PureClip center minus","PureCLIP max center plus", "PureCLIP max center minus","with 2 CL sites plus", "with 2CL sites minus")table_BS_steps <-cbind(names_BS_steps, n_BS_steps)kable(table_BS_steps)save(table_BS_steps, file =paste0(pureclip_path,"BS_def_steps_oe.RData"))########################### final Binding sites#########################binding_sites_final <-c(binding_sites_center_PSmax_plus_2cl, binding_sites_center_PSmax_minus_2cl)%>%sort# add center sob, pureclip score and cl events# get pureclip sites that overlap with center peakspureclip_sites_overlapping_BS_center <-subsetByOverlaps(pureclip_sites_anno, (binding_sites_final-2)) %>% sort# when both Granges are sorted, they have the same order, # metadata can be moved columnwise without matching# binding_sites_final$gene_id <- pureclip_sites_overlapping_BS_center$gene_idbinding_sites_final$SOB <- pureclip_sites_overlapping_BS_center$SOBbinding_sites_final$score <- pureclip_sites_overlapping_BS_center$scorebinding_sites_final$cl_events_center <- pureclip_sites_overlapping_BS_center$cl_eventssave(binding_sites_final, file=paste0(pureclip_path,"Binding_site_windows_5nt_final.RData"))
7 Sample Reproducibility and Reproducibility filtering
Binding sites are demed reprodrucible, if the they are found in at least three of the four binding sites. The weakest BS per sample with the 0.05 percentile of CL events per BS are not considered for the reproducibility.
per0.05
per0.1
per0.2
variable
5
7
9
clp_rep1
3
4
6
clp_rep2
3
5
7
clp_rep3
5
6
8
clp_rep4
cutoff for the four samples, at 5%, 10% and 20% quantile
Show code
########################## mark samplewise the lowest 20% percentile of cl per BS ########################binding_sites_cl_samples$cl1_overThresh <- binding_sites_cl_samples$clp_rep1 > quantiles_samples[1,3]binding_sites_cl_samples$cl2_overThresh <- binding_sites_cl_samples$clp_rep2 > quantiles_samples[2,3]binding_sites_cl_samples$cl3_overThresh <- binding_sites_cl_samples$clp_rep3 > quantiles_samples[3,3]binding_sites_cl_samples$cl4_overThresh <- binding_sites_cl_samples$clp_rep4 > quantiles_samples[4,3]# in how many samples is the number of cls per bs over the threshold?binding_sites_cl_samples$nsamples_overThresh <-rowSums(cbind(binding_sites_cl_samples$cl1_overThresh, binding_sites_cl_samples$cl2_overThresh, binding_sites_cl_samples$cl3_overThresh, binding_sites_cl_samples$cl4_overThresh))# keep only BS with cls over threshold in at least 3 of the 4 samplesbinding_sites_repro <- binding_sites_cl_samples[binding_sites_cl_samples$nsamples_overThresh>=3,] %>%makeGRangesFromDataFrame(., keep.extra.columns = T)#save(binding_sites_repro, file=paste0(output_path,"final_repro_BS_oe.RData" ))
As can be seen in the reproducibility matrix the BS of all samples correlate well together. The ECDF of the coefficent variation also shows a good reporducibility as a ECDF of 1 is reached at around 0.7 (1 is proposed as a cutoff). The cv depends on the standard deviation. Probably it is more meaningfull to look at the standard error of the experiment> As SE = SD /(number of observations per sample)^-0.5 for 4 samples the SE is half of the SD.
8 Bound Genes and regions
8.1 assign bound gene type
many binding sites overlap with multiple genes (non codings!)
in IGV it look like rather the non-coding RNA than the protein-coding RNA is bound in that case
we therefore use a hieracry on gene types: lncRNA > miRNA > miscRNA > snRNA > snoRNA > protein-coding
Show code
############################# annote binding sites with the bound ( = overlapping) gene################################# overlaps are resolved by:# 1. for two genes of different type by the type hieracy# 2. for to protein coding genes: in this chunk one protein coding gene is assigned at random in the next chunk the region hieracry is used to make this decision and the gene info is changed thenbinding_sites <- binding_sites_repro annotation_gene <- annotation[annotation$type=="gene"]# overlaps of bs with any geneol <-findOverlaps(annotation_gene, binding_sites, ignore.strand=F)# index binding sitesbinding_sites$idx <-1:NROW(binding_sites)# get all possible regions overlapping with crosslinked nucleotide in a temporary filebinding_sites_temp1 <- binding_sites[subjectHits(ol)]binding_sites_temp1 <-sortSeqlevels(binding_sites_temp1)elementMetadata(binding_sites_temp1) <-c(elementMetadata(binding_sites_temp1), elementMetadata(annotation_gene[queryHits(ol), c("gene_id", "gene_type", "gene_name")]))table(binding_sites_temp1$gene_type)
# chose for each binding site a gene from the highest possible hierarchy# if two genes from the same type would be present the first one is chosen (randomly)binding_sites_temp2 <- binding_sites_temp1 %>%as.data.frame(.) %>%# this file contains several entries for 1 binding site overlapping with more than one genegroup_by(idx) %>%# the idx colum is an index of the binding sitesarrange(factor(gene_type, levels =c("lncRNA", "miRNA", "miscRNA", "snRNA", "snoRNA", "protein_coding" )), .by_group = T) %>%# arrange by hierarcy dplyr::slice(1) # choose randomly the first gene (the chosen gene is changed in the next chunk for protein coding genes)binding_sites_temp2 <-makeGRangesFromDataFrame(binding_sites_temp2, keep.extra.columns = T)table(binding_sites_temp2$gene_type)
# non coding rnas "fake" region non codingbs_temp_non_cod <- binding_sites_temp2[binding_sites_temp2$gene_type!="protein_coding",] %>%makeGRangesFromDataFrame(., keep.extra.columns = T)bs_temp_non_cod$type <-"non_coding"
8.2 Binding site region by hiercary approach
Overlap Bs with mutiple regions and choose one with highest priority
# classify only protein coding genes, regions only from bound genesanno_regions <- annotation[annotation$gene_type =="protein_coding"& annotation$type %in%c("three_prime_UTR", "five_prime_UTR", "CDS") ]#get intronsintrons <-intronsByTranscript(anno_txdb, use.names=T) %>%unlist(.)introns$type ="intron"anno_regions <-c(anno_regions, introns)#overlaps of binding sites with protein coding genesbinding_sites_temp_pc <- binding_sites_temp2[binding_sites_temp2$gene_type=="protein_coding"]ol_bs <-findOverlaps(anno_regions, binding_sites_temp_pc-2, ignore.strand =FALSE)# get all possible regions overlapping with bsbinding_sites_temp_pc2 <- binding_sites_temp_pc[subjectHits(ol_bs)]binding_sites_temp_pc2 <-sortSeqlevels(binding_sites_temp_pc2)# get possible region and transcript annotation of bsanno_bs <-elementMetadata(anno_regions[queryHits(ol_bs)]) %>%as.data.frame(.) %>% dplyr::select(c("type", "gene_id"))# and add to temp binding sitescolnames(anno_bs) <-c("type", "gene_id.from_region")elementMetadata(binding_sites_temp_pc2) <-c(elementMetadata(binding_sites_temp_pc[subjectHits(ol_bs)]), anno_bs)# choose type highest in hiearcybinding_sites_temp_pc3 <-as.data.frame(binding_sites_temp_pc2) %>%mutate(., gene_id = gene_id.from_region ) %>%# this throws out regions that overlap with non-coding RNAs, with this line no binding sites in introns!!!group_by(idx) %>%# specific id per binding sitearrange(factor(type, levels =c("three_prime_UTR", "five_prime_UTR", "CDS", "intron")), .by_group = T) %>% dplyr::slice(1) # arrange by region, select first, transcript info is ignored herebinding_sites_temp_pc3 <-makeGRangesFromDataFrame(binding_sites_temp_pc3, keep.extra.columns = T)sortSeqlevels(binding_sites_temp_pc3)
# merge BS from non-coding RNAs and binding sites with specific regionsbinding_sites_with_regions <-c(binding_sites_temp_pc3, bs_temp_non_cod)# some protein coding genes not have a region, because no trustworth transcript is annotated in this position of the genebs_without_region <- binding_sites[-queryHits(findOverlaps(binding_sites, binding_sites_with_regions))]
---title: "Binding site definition on IH-AB FLAG-PURA overexpression data set "date: "`r format(Sys.time(), '%d %B, %Y')`"author: - name: Melina Klostermannformat: html: code-fold: true code-overflow: scroll code-summary: "Show code" code-tools: true code-line-numbers: true toc: true toc-depth: 3 toc-location: left toc-expand: false number-sections: true theme: sandstone fontsize: 11pt linestretch: 1.5 fig-format: svg cap-location: margin crossref: fig-title: Fig embed-resources: true link-external-newwindow: true smooth-scroll: true execute: echo: true warning: false---```{r setup, include=FALSE}require("knitr")opts_knit$set(root.dir ="/Users/melinaklostermann/Documents/projects/Thesis/Thesis_code/PURA/01_binding_site_definition_oePURA")knitr::opts_chunk$set(warning=FALSE, message=FALSE, cache=TRUE, tidy.opts=list(width.cutoff=80))library(ggplot2) source("/Users/melinaklostermann/Documents/projects/R_general_functions/theme_thesis.R")theme_set(theme_thesis())``````{r libraries, include=FALSE}library(GenomicRanges)library(rtracklayer)library(knitr)library(GenomicFeatures)library(dplyr)library(ggpubr)library(Gviz)```# What was done in this report?In this script I define PURA binding sites from the IH-AB FLAG-PURA iCLIP data set. The following steps were performed in this order:1. Definition of Binding sites and filtering + PureCLIP peak calling (on merged sample1+2 sample3+4) + Global filter of pureclip sites (>5 cl events on pureclip site) + Local (genewise) filter of pureclip sites ( > 75% percentile of gene) + Binding site definition (5nt, Matrix approach, discard BS where center is not max pureclip, has less then 2 sites with crosslinks) + Reproducibility filtering (soft boundary: 0.05% percentile of cl per binding site per sample, repro in min 3 of 4 samples) + Comparison with old BS definition pipeline 2. Analysis of biniding behaviour + Analysis of bound gene types + Analysis of bound gene regions# Input```{r input, echo=TRUE}raw_path <-"/Users/melinaklostermann/Documents/projects/PURA/01_raw_data/oe-imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/"# crosslink sites in bw format###########################################all 4 sample merged by Ankebw_all_plus_path <-paste0(raw_path, "merged/bw/imb_koenig_2019_11_allsamples.v2uniqMD.duprm.plus.bw")bw_all_minus_path <-paste0(raw_path, "merged/bw/imb_koenig_2019_11_allsamples.v2uniqMD.duprm.minus.bw")# single samplesbw_1_plus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample1.v2uniqMD.duprm.plus.bw")bw_1_minus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample1.v2uniqMD.duprm.minus.bw")bw_2_plus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample2.v2uniqMD.duprm.plus.bw")bw_2_minus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample2.v2uniqMD.duprm.minus.bw")bw_3_plus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample3.v2uniqMD.duprm.plus.bw")bw_3_minus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample3.v2uniqMD.duprm.minus.bw")bw_4_plus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample4.v2uniqMD.duprm.plus.bw")bw_4_minus_path <-paste0(raw_path,"bw/all/DR/imb_koenig_2019_11_sample4.v2uniqMD.duprm.minus.bw")#bw_all_samples_path <-"bw_4samples.RData"bw_merges_path <-"bw_merges.RData"# pureclip calls ####################################(obtained by running pureclip on pseudo samples 1u2 and 3u4 see below)pureclip_path <-"/Users/melinaklostermann/Documents/projects/PURA/02_R_new_pip/01-BS_def/01-BS_def_oe/PureCLIP"# gencode annotation v31################################## this is the filtered annotation as used in molitor et alannotation <-readRDS("/Users/melinaklostermann/Documents/projects/PURA/Molitor-et-al-2022/annotation.rds")anno_txdb <-makeTxDbFromGRanges(annotation)output_path <-"/Users/melinaklostermann/Documents/projects/Thesis/Thesis_code/PURA/01_binding_site_definition_oePURA/"```# Look on raw bwssample1 is the strongest and sample2 the weakest sample. Still all samples have a very nice depth in signal.```{r import_bws, eval=FALSE, include=FALSE}bw_1_plus <-import.bw(bw_1_plus_path)bw_1_minus <-import.bw(bw_1_minus_path)strand(bw_1_plus) <-"+"strand(bw_1_minus) <-"-"bw_2_plus <-import.bw(bw_2_plus_path)bw_2_minus <-import.bw(bw_2_minus_path)strand(bw_2_plus) <-"+"strand(bw_2_minus) <-"-"bw_3_plus <-import.bw(bw_3_plus_path)bw_3_minus <-import.bw(bw_3_minus_path)strand(bw_3_plus) <-"+"strand(bw_3_minus) <-"-"bw_4_plus <-import.bw(bw_4_plus_path)bw_4_minus <-import.bw(bw_4_minus_path)strand(bw_4_plus) <-"+"strand(bw_4_minus) <-"-"bw_all_plus <-import.bw(bw_all_plus_path)bw_all_minus <-import.bw(bw_all_minus_path)strand(bw_all_plus) <-"+"strand(bw_all_minus) <-"-"#! bw merges neibouring sites with same score -> split, readd scoressplit_bw_crosslinks_to_1_nt <-function(bw){# split ranges in 1 nt events bw_split <- exomeCopy::subdivideGRanges(bw, subsize=1)# match scores by an overlap index idx <-findOverlaps(bw_split, bw)# add scores bw_split$score <- bw[subjectHits(idx)]$score# readd strand infostrand(bw_split) <-strand(bw)return(bw_split)}bw_all_samples <-list(bw_1_plus, bw_1_minus, bw_2_plus, bw_2_minus, bw_3_plus, bw_3_minus, bw_4_plus, bw_4_minus) %>%lapply(., function(x) split_bw_crosslinks_to_1_nt(bw=x))bw_merges <-list(bw_all_plus, bw_all_minus) %>%lapply(., function(x) split_bw_crosslinks_to_1_nt(bw=x))save(bw_all_samples, file = bw_all_samples_path)save(bw_merges, file = bw_merges_path)``````{r overview_bws, echo=FALSE}load(bw_all_samples_path)load(bw_merges_path)bw_overview_list <-c(bw_all_samples, bw_merges)bw_overview_df <-data.frame(sample=c("sample 1 +", "sample 1 -", "sample 2 +", "sample 2 -", "sample 3 +", "sample 3 -", "sample 4 +", "sample 4 -", "all samples +", "all samples -" ), CLsites =sapply(bw_overview_list, length), CLevents =sapply(bw_overview_list, function(x){ return(sum(x$score))}))bw_overview_df <-cbind(bw_overview_df, CLeventsPerSite = bw_overview_df$CLevents/bw_overview_df$CLsites)kable(bw_overview_df, caption ="Distribution of CL sites and events in the 4 samples")```# PureCLIP peak callingPureCLIP was performed on the bam files of 2 pseudo samples (as it can only use 2 samples at the moment): A merge of sample 1 and 2 and a merge of sample 3 and 4. The bam files of all 4 samples were obtained from Ankes pipeline. The files were merged with samtools merge.This is the bash code that was executed on the server:```{r, engine="bash", eval=FALSE, include=TRUE}# Mergemelina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/bam/all/DR$ conda activate(base) melina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/bam/all/DR$ samtools merge sample1u2.v2uniqMD.bam imb_koenig_2019_11_sample1.v2uniqMD.duprm.bam imb_koenig_2019_11_sample2.v2uniqMD.duprm.bam (base) melina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/bam/all/DR$ samtools merge sample3u4.v2uniqMD.bam imb_koenig_2019_11_sample3.v2uniqMD.duprm.bam imb_koenig_2019_11_sample4.v2uniqMD.duprm.bam(base) melina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/bam/all/DR$ samtools index sample1u2.v2uniqMD.bam (base) melina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/bam/all/DR$ samtools index sample3u4.v2uniqMD.bam <p># Copy to share projectmelina@gateway:/share/project/zarnack/melina/pureclip_pura_oe_pip2$ cp ~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/bam/all/DR/sample* .# Get genome annotationmelina@gateway:/share/project/zarnack/melina/pureclip_pura_oe_pip2$ wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_31/GRCh38.p12.genome.fa.gzmelina@gateway:/share/project/zarnack/melina/pureclip_pura_oe_pip2$ gunzip GRCh38.p12.genome.fa.gz#PureCLIP Script#!/bin/bash <p>#SBATCH --partition=all <p>#SBATCH --cpus-per-task=6 <p>#SBATCH --mem-per-cpu=32gb <p>#SBATCH --job-name="PureCLIP_PURA_oe_pip2" <p>#SBATCH --array=1 <p>#SBATCH --output=pureclip_pura_o2_pip2.out # Standard output and error log <p>#SBATCH --error=pureclip_pura_oe_pip2.err <p>#SBATCH --mail-user=melinaklostermann@googlemail.com <p>#SBATCH --mail-type=ALL <p>echo This is task \$SLURM_ARRAY_TASK_IDconda activatepureclip -i /share/project/zarnack/melina/pureclip_pura_oe_pip2/sample1u2.v2uniqMD.bam -bai /share/project/zarnack/melina/pureclip_pura_oe_pip2/sample1u2.v2uniqMD.bam.bai -i /share/project/zarnack/melina/pureclip_pura_oe_pip2/sample3u4.v2uniqMD.bam-bai /share/project/zarnack/melina/pureclip_pura_oe_pip2/sample3u4.v2uniqMD.bam.bai -g /share/project/zarnack/melina/pureclip_pura_oe_pip2/GRCh38.p12.genome.fa -o peakcalling_pura_oe_pip2_sites.bed -or peakcalling_pura_oe_pip2_regions.bed -nt 10#Run job(base) melina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/PureCLIP_PURA_oe_pip2$ sbatch jobscript_Pureclip_PURA_oe_pip2.sh <p>(base) melina@gateway:~/imb_koenig_2019_11_koenig_iCLIP_PURA___v2uniqMD/PureCLIP_PURA_oe_pip2$ squeue``````{r load_pureCLIP}pureclip_sites <-import(paste0(pureclip_path,"/peakcalling_pura_oe_pip2_sites.bed"), format ="bedgraph")#clean up columnspureclip_sites <-as.data.frame(pureclip_sites) %>%makeGRangesFromDataFrame(keep.extra.columns = T)pureclip_sites$NA..2<-NULLpureclip_sites$score <- pureclip_sites$NA.pureclip_sites$NA. <-NULLstrand(pureclip_sites) <- pureclip_sites$NA..1pureclip_sites$NA..1<-NULLpureclip_sites$round_score <-round(pureclip_sites$score, digits =1)pureclip_sites <-keepStandardChromosomes(pureclip_sites, pruning.mode ="coarse")pureclip_sites```# Add annotated genes to PureCLIP sitesPureCLIP calls are loaded into R. The metadata of the resulting GRAnges object is cleaned up, as the columns get wrong names and empty columns are intruduced, when importing .bed files as GRanges objects (not shown). <p>Gene annotations are megred to PureCLIP sites. For this the annotation is filtered for overlapping genes. Of two overlaping genes the longer is kept.```{r POB_calc}annotation_genes <- annotation[annotation$type=="gene"] # sortiere anno nach laenge, damit bei uberlappenden genen das laengere (erste) behalten wirdannotation_genes <-sort(annotation_genes, by =~ width + seqnames + start)# remove ueberlappende geneannotation_genes$dup <-duplicated.GenomicRanges(annotation_genes)annotation_genes_dup_rem <- annotation_genes[annotation_genes$dup == F]# get genes that belong to pureclip sitepureclip_sites_anno <-subsetByOverlaps(pureclip_sites, annotation_genes_dup_rem)# if start of gene 1 = end of gene 2 keep firstidx <-findOverlaps(pureclip_sites_anno, annotation_genes_dup_rem, ignore.strand =FALSE, minoverlap = 1L, select ="first")# remove pureclip sites that are not annotated on a genepureclip_sites_anno[isNA(idx)] <-NULLidx <-na.omit(idx)# combine metadata of gene anno und pureclip siteselementMetadata(pureclip_sites_anno) <-cbind(elementMetadata(pureclip_sites_anno),elementMetadata(annotation_genes_dup_rem[idx]))# remove empty columns (empty for genes)pureclip_sites_anno$transcript_id <-NULLpureclip_sites_anno$transcript_type <-NULLpureclip_sites_anno$transcript_name <-NULLpureclip_sites_anno$transcript_support_level <-NULLpureclip_sites_anno$havana_transcript <-NULLpureclip_sites_anno$hgnc_id <-NULLpureclip_sites_anno$exon_number <-NULLpureclip_sites_anno$exon_id <-NULLpureclip_sites_anno$protein_id <-NULLpureclip_sites_anno$ccdsid <-NULLpureclip_sites_anno$ont <-NULLsave(pureclip_sites_anno, file=paste0(output_path,"peakcalling_pura_oe_sites_with_geneanno.RData"))save(annotation_genes_dup_rem, file=paste0(output_path,"annotation.RData"))```## How many Pureclip sites are there per gene?A problem for this normalisation are obviously genes with very few or only one pureclip call. This is the distribution of PureCLIP calls per gene: <p>```{r echo=FALSE}#How many pureclip calls per geneload(paste0(output_path,"peakcalling_pura_oe_sites_with_geneanno.RData"))pureclip_sites_per_gene <- pureclip_sites_anno %>%as.data.frame(.) %>%group_by(gene_id) %>%summarise(ps_gene_num =n())ggplot(pureclip_sites_per_gene, aes(x=ps_gene_num))+geom_histogram(binwidth =1)+ ggforce::facet_zoom(xlim=c(0, 100))+ggtitle("Distribution Pureclip sites per Gene")```Of all bound genes `r round(sum(pureclip_sites_per_gene$ps_gene_num >5)/nrow(pureclip_sites_per_gene), 4)*100`% contain more than 5 PureCLIP calls. Therefore, the calculation of the SOB on PureCLIP sites should work fine. # iCLIP2: loads of signal -- Global and local (genewide) filterAs this data set has a very high signal depth (which is nice), also potential background signals are amplified. In order to focus on strong binding events, weaker PureCLIP sites are filtered out. A global and a local (genewide) filter are applied. The idea of the gene wide filter is that genes which were less abundant within the samples, will also have a weaker signal. A strong global filter would therefore not only filter strong binding events, but might introduce a bias towards strongly expressed genes. In the local filter one can filter more stringendly without introducing this bias. However, one has to keep in mind that the PURA homodimer in theory has 6 RNA binding domains, and its binding to a RNA might also depent on two or more adjecant binding sites.## Global filterTo choose a global filter, we can look at the distribution of the mean cl events on pureclip sites per gene and the quantiles of cl events on all pureclip sites.```{r cl_mean_per_gene, echo=FALSE}# distribution of pureclip scorepureclip_sites_anno <- pureclip_sites_anno %>%as.data.frame(.)ggplot(pureclip_sites_anno, aes(x=score))+geom_histogram(binwidth =1) + ggforce::facet_zoom(xlim=c(0,20))+ggtitle("Distribution of Genewise CL mean on pureclip sites")# CL quantiles q <-quantile(pureclip_sites_anno$score, probs =seq(0,1, by=0.05))# Apply global filter of 5 CL events per pureclip sitepureclip_sites_anno <- pureclip_sites_anno[pureclip_sites_anno$score>q["5%"],]```As the lowest mean cl events on pureclip sites per gene are 4 and 5, a filter above 5 will already remove no background signal on these genes. As the data is from 4 replicates 5 cl events, could mean 3 samples with 1 cl and one sample with 2 cl (other distributions are also possible), which should be a minimum to trust a peak. If a filter of 5 cl events per pureclip site is applied `r round(nrow(pureclip_sites_anno[pureclip_sites_anno$cl_events < 6,]) / nrow(pureclip_sites_anno), digits = 4)*100` % of the pureclip sites are discarded.## Genewise filterHow many BS per gene make sense? <p>* Pura dimer has in theory 6 RNA Binding sites, could also bind several times the same RNAThe advantage of the genewise filter is, that it will not introduce a bias towards strongly expressed genes. However, a to stringend genewise filter might not account for more complex binding patterns. As PURA might bind at more than one site at the same time and a binding site is more than 1 nt long, the genewise filter should allow several PureCLIp sites per gene.On the other hand for the intended following analysis in theory one BS per gene would be sufficient.To choose a suitable percentile for filtering, percentiles in steps of 5 are calculated for all genes. These can then be plotted in a boxplot to see their distribution: <p>```{r genewise_filter, eval=TRUE, include=TRUE}##################filter pureclip hits local################make purr map with different percentilesp <-seq(from=0, to=1, by=0.05)p_names <- purrr::map_chr(p, ~paste0(.x*100, "%"))#write a function with purrr to calulate the different percentilesp_funs <- purrr::map(p, ~purrr::partial(quantile, probs = .x, na.rm =TRUE)) %>% purrr::set_names(nm = p_names)#use dyplr to calculate different percentiles for each genequant_genewise <- pureclip_sites_anno %>%as.data.frame(.) %>% dplyr::group_by(gene_id) %>% dplyr::summarize_at("score", funs(!!!p_funs))%>% as.data.framehead(quant_genewise)# use cl event 75% quantile for filteringquant_genewise_75 <-data.frame(gene_id = quant_genewise$gene_id, score_75_quantile = quant_genewise$`75%`)pureclip_sites_anno<-merge(pureclip_sites_anno, quant_genewise_75, by="gene_id")pureclip_sites_anno <- pureclip_sites_anno[ pureclip_sites_anno$score > pureclip_sites_anno$score_75_quantile,]save(pureclip_sites_anno, file=paste0(output_path, "peakcalling_pura_oe_sites_anno_filtered.RData"))rm(pureclip_sites_per_gene, pureclip_sites_per_gene_plot, quant_genewise, quant_genewise_75, quant_rel, quant_rel_gg)```In the end the 75% lower percentile per gene is filtered out.# Binding site definitionBinding sites are obtained, by merging all pureclip sites with a gap <= 8, and then splitting up the large BS obtained from the merging. The large binding sites are split up using a rle matrix. This approach requires afterwards to throw out Binding sites (BS) whose center is no Pureclip site or does not have the maximun Pureclip score.```{r BS_definition_function, eval=FALSE, include=TRUE}#################################### Binding site Definition####################################Parameters:#pureclip - Granges containing PureCLIP sites# bw_plus, bw_minus - path to bw files# windowsize - size thet Bindingsites should have (should be uneven)Define_Binding_Sites <-function(pureclip, bw_plus, bw_minus, windowsize, out){# Merge Gaps < 8 from single pureclip sitespureclip =reduce(pureclip, min.gapwidth =8)#remove sites with 1 or 2 nt lengthpureclip = pureclip[width(pureclip) >2] bw_plus =import.bw(bw_plus, as="Rle") bw_minus =import.bw(bw_minus, as="Rle") final.peaks.plus.gr <-GRanges() final.peaks.minus.gr <-GRanges()#Initialize the remaining PureCLIP CL regions to check for peaks remaining.regions.plus.gr <-subset(pureclip, strand =="+") remaining.regions.minus.gr <-subset(pureclip, strand =="-") window.radius <- (windowsize-1)/2while(TRUE){#No regions left to check for peaksif (length(remaining.regions.plus.gr) ==0&length(remaining.regions.minus.gr) ==0){break }if (length(remaining.regions.plus.gr) !=0 ){#Get the raw CL counts in the remaining PureCLIP CL regions# returns rle list of all regions and turns it into matrix raw.remaining.PureCLIP.CL.regions.plus.m <-as.matrix(bw_plus[remaining.regions.plus.gr])#Identify the center of the PureCLIP CL regions (Position with max counts)# and store its indice raw.remaining.PureCLIP.CL.regions.plus.m[is.na(raw.remaining.PureCLIP.CL.regions.plus.m)] <--Inf# set Na to -infinite max.pos.indice.plus <-max.col(raw.remaining.PureCLIP.CL.regions.plus.m, ties.method ="first")#Create a peak region of xnt that is centered to the max position peaks.plus.gr <- remaining.regions.plus.grstart(peaks.plus.gr) <-start(peaks.plus.gr) + max.pos.indice.plus -1end(peaks.plus.gr) <-start(peaks.plus.gr) peaks.plus.gr <- peaks.plus.gr + window.radius#Store the new peaks final.peaks.plus.gr <-c(final.peaks.plus.gr, peaks.plus.gr)#Remove the peaks from the CL regions to search for additional peaks#Excise additionally x nucleotides up and downstream peaks.plus.grl <-as(peaks.plus.gr+window.radius, "GRangesList") remaining.regions.plus.gr <-unlist(psetdiff(remaining.regions.plus.gr, peaks.plus.grl)) }if (length(remaining.regions.minus.gr) !=0 ){#Get the raw CL counts in the remaining PureCLIP CL regions# returns rle list of all regions and turns it into matrix raw.remaining.PureCLIP.CL.regions.minus.m <-as.matrix( bw_minus[remaining.regions.minus.gr])#Identify the center of the PureCLIP CL regions (Position with max counts) #and store its indice raw.remaining.PureCLIP.CL.regions.minus.m[is.na(raw.remaining.PureCLIP.CL.regions.minus.m)] <--Inf max.pos.indice.minus <-max.col(raw.remaining.PureCLIP.CL.regions.minus.m, ties.method ="last")#Create a peak region of xnt that is centered to the max position peaks.minus.gr <- remaining.regions.minus.grstart(peaks.minus.gr) <-start(peaks.minus.gr) + max.pos.indice.minus -1end(peaks.minus.gr) <-start(peaks.minus.gr) peaks.minus.gr <- peaks.minus.gr + window.radius#Store the new peaks final.peaks.minus.gr <-c(final.peaks.minus.gr, peaks.minus.gr)#Remove the peaks from the CL regions to search for additional peaks#Excise additionally x nucleotides up and downstream peaks.minus.grl <-as(peaks.minus.gr+window.radius, "GRangesList") remaining.regions.minus.gr <-unlist(psetdiff(remaining.regions.minus.gr, peaks.minus.grl)) } }export(final.peaks.plus.gr, con=paste(out,"_plus.bed", sep =""), format ="bed")export(final.peaks.minus.gr, con=paste(out,"_minus.bed", sep =""), format ="bed")save(final.peaks.minus.gr, file=paste(out,"_plus.RData", sep =""))save(final.peaks.plus.gr, file=paste(out,"_minus.RData", sep ="")) returnlist <-list(peaks.minus = final.peaks.minus.gr, peaks.plus = final.peaks.plus.gr)return(returnlist)}pureclip_sites_anno <-makeGRangesFromDataFrame(pureclip_sites_anno,keep.extra.columns =TRUE)binding_sites <-Define_Binding_Sites(pureclip = pureclip_sites_anno, bw_plus = bw_all_plus_path, bw_minus = bw_all_minus_path,windowsize =5, out =paste0(output_path,"Binding_site_windows_5nt" ))```## Throw out BS where not pureclip site or not max pureclip site```{r discard_wrong_BS, eval=FALSE, include=TRUE}load(paste0(output_path,"Binding_site_windows_5nt_plus.RData"))load(paste0(output_path,"Binding_site_windows_5nt_minus.RData"))############################# Keep only BS with PureCLIP center############################# get all BSbinding_sites <-c(final.peaks.minus.gr, final.peaks.plus.gr)#get centersBS_centers <- binding_sites -2#keep only overlaps with pureclip sitespureclip_sites_anno <-makeGRangesFromDataFrame(pureclip_sites_anno, keep.extra.columns =TRUE)binding_sites_center_PS <- binding_sites[queryHits(findOverlaps( BS_centers, pureclip_sites_anno))]############################ Keep only BS with max PureCLIP site at center########################### get bw rlesbw_plus_rle <-import.bw(bw_all_plus_path, as="Rle")bw_minus_rle <-import.bw(bw_all_minus_path, as="Rle")# split BS by strandbinding_sites_center_PS_plus <- binding_sites_center_PS[strand(binding_sites_center_PS)=="+"]binding_sites_center_PS_minus <- binding_sites_center_PS[strand(binding_sites_center_PS)=="-"]# make matrix of BSbinding_sites_center_PS_plus_m <-as.matrix(bw_plus_rle[binding_sites_center_PS_plus])binding_sites_center_PS_minus_m <-as.matrix(bw_minus_rle[binding_sites_center_PS_minus])# calc max for each BS (one BS is one row in the matrix)max_BS_plus <-apply(binding_sites_center_PS_plus_m,1,max)max_BS_minus <-apply(binding_sites_center_PS_minus_m,1,max)# subset for center = maxbinding_sites_center_PSmax_plus <- binding_sites_center_PS_plus[ max_BS_plus == binding_sites_center_PS_plus_m[,3]] binding_sites_center_PSmax_minus <- binding_sites_center_PS_minus[ max_BS_minus == binding_sites_center_PS_minus_m[,3]]############################ Keep only BS with at least 2 crosslink sites############################binding_sites_center_PSmax_plus_m <-as.matrix(bw_plus_rle[binding_sites_center_PSmax_plus])binding_sites_center_PSmax_minus_m <-as.matrix(bw_minus_rle[binding_sites_center_PSmax_minus])crosslink_sites_plus <-apply(binding_sites_center_PSmax_plus_m, 1, function(x) 5-sum(x ==0)) crosslink_sites_minus <-apply(binding_sites_center_PSmax_minus_m, 1, function(x) 5-sum(x ==0)) binding_sites_center_PSmax_plus_2cl <- binding_sites_center_PSmax_plus[crosslink_sites_plus >1]binding_sites_center_PSmax_minus_2cl <- binding_sites_center_PSmax_minus[crosslink_sites_minus >1]############################ BS definition steps table##########################BS_steps_list <-list(final.peaks.plus.gr, final.peaks.minus.gr, binding_sites_center_PS_plus, binding_sites_center_PS_minus, binding_sites_center_PSmax_plus, binding_sites_center_PSmax_minus, binding_sites_center_PSmax_plus_2cl, binding_sites_center_PSmax_minus_2cl )n_BS_steps <-sapply(BS_steps_list, NROW)names_BS_steps <-c("definded BS plus", "defined BS minus", "PureCLIP center plus", "PureClip center minus","PureCLIP max center plus", "PureCLIP max center minus","with 2 CL sites plus", "with 2CL sites minus")table_BS_steps <-cbind(names_BS_steps, n_BS_steps)kable(table_BS_steps)save(table_BS_steps, file =paste0(pureclip_path,"BS_def_steps_oe.RData"))########################### final Binding sites#########################binding_sites_final <-c(binding_sites_center_PSmax_plus_2cl, binding_sites_center_PSmax_minus_2cl)%>%sort# add center sob, pureclip score and cl events# get pureclip sites that overlap with center peakspureclip_sites_overlapping_BS_center <-subsetByOverlaps(pureclip_sites_anno, (binding_sites_final-2)) %>% sort# when both Granges are sorted, they have the same order, # metadata can be moved columnwise without matching# binding_sites_final$gene_id <- pureclip_sites_overlapping_BS_center$gene_idbinding_sites_final$SOB <- pureclip_sites_overlapping_BS_center$SOBbinding_sites_final$score <- pureclip_sites_overlapping_BS_center$scorebinding_sites_final$cl_events_center <- pureclip_sites_overlapping_BS_center$cl_eventssave(binding_sites_final, file=paste0(pureclip_path,"Binding_site_windows_5nt_final.RData"))```# Sample Reproducibility and Reproducibility filteringBinding sites are demed reprodrucible, if the they are found in at least three of the four binding sites. The weakest BS per sample with the 0.05 percentile of CL events per BS are not considered for the reproducibility. ```{r compare_cL_per_nt, echo=FALSE}# merge plus and minus of bws of all 4 samples, output: list with bws of the 4 samplesbw_all_samples_plmi <-list(c(bw_all_samples[[1]], bw_all_samples[[2]]), c(bw_all_samples[[3]], bw_all_samples[[4]]), c(bw_all_samples[[5]], bw_all_samples[[6]]), c(bw_all_samples[[7]], bw_all_samples[[8]]))# get total number of cl events per samplen_total_cl_events_per_sample <-sapply(bw_all_samples_plmi, function(x) sum(x$score))# get score of cl events per nt sample_scores <-data.frame(cl_score =c(bw_all_samples_plmi[[1]]$score, bw_all_samples_plmi[[2]]$score, bw_all_samples_plmi[[3]]$score, bw_all_samples_plmi[[4]]$score), sample =c(rep("sample1", NROW(bw_all_samples_plmi[[1]])), rep("sample2", NROW(bw_all_samples_plmi[[2]])),rep("sample3", NROW(bw_all_samples_plmi[[3]])),rep("sample4", NROW(bw_all_samples_plmi[[4]]))))sample_scores[sample_scores$cl_score >20,]$cl_score <-20# get total number of cl events per samplen_total_cl_events_per_sample <-sapply(bw_all_samples_plmi, function(x) sum(x$score))plot_compare_cL_per_nt <-ggplot(sample_scores, aes(x=cl_score)) +geom_histogram(binwidth=1, colour="#999999", fill="#E69F00")+facet_wrap(~ sample)+labs(x="cl events / BS", y="frequency")+theme_bw()+ggtitle("Distribution of cl events per BS")plot_compare_cL_per_nt``````{r compare_cl_per_BS, echo=FALSE}#get bws as rlessample1.minus.rle <-import.bw( bw_1_minus_path, as="Rle") %>%keepStandardChromosomes(pruning.mode ="coarse")sample2.minus.rle <-import.bw( bw_2_minus_path, as="Rle") %>%keepStandardChromosomes(pruning.mode ="coarse")sample3.minus.rle <-import.bw( bw_3_minus_path, as="Rle") %>%keepStandardChromosomes(pruning.mode ="coarse")sample4.minus.rle <-import.bw( bw_4_minus_path, as="Rle") %>%keepStandardChromosomes(pruning.mode ="coarse")sample1.plus.rle <-import.bw( bw_1_plus_path, as="Rle") %>%keepStandardChromosomes(pruning.mode ="coarse")sample2.plus.rle <-import.bw( bw_2_plus_path, as="Rle") %>%keepStandardChromosomes(pruning.mode ="coarse")sample3.plus.rle <-import.bw( bw_3_plus_path, as="Rle") %>%keepStandardChromosomes(pruning.mode ="coarse")sample4.plus.rle <-import.bw( bw_4_plus_path, as="Rle") %>%keepStandardChromosomes(pruning.mode ="coarse")#Sum up cl events per binding site load(paste0(pureclip_path,"Binding_site_windows_5nt_final.RData"))bs.p = binding_sites_final[strand(binding_sites_final) =="+"]bs.p$clp_rep1 = sample1.plus.rle[bs.p] %>% sumbs.p$clp_rep2 = sample2.plus.rle[bs.p] %>% sumbs.p$clp_rep3 = sample3.plus.rle[bs.p] %>% sumbs.p$clp_rep4 = sample4.plus.rle[bs.p] %>% sumbs.m = binding_sites_final[strand(binding_sites_final) =="-"]bs.m$clp_rep1 = sample1.minus.rle[bs.m] %>% sumbs.m$clp_rep2 = sample2.minus.rle[bs.m] %>% sumbs.m$clp_rep3 = sample3.minus.rle[bs.m] %>% sumbs.m$clp_rep4 = sample4.minus.rle[bs.m] %>% sum#Combinebinding_sites_cl_samples =c(bs.p, bs.m)# Caclulate percentile based thresholdquantiles_samples =data.frame(per0.05 =c(quantile(binding_sites_cl_samples$clp_rep1, prob=seq(0,1, by =0.05))[2],quantile(binding_sites_cl_samples$clp_rep2, prob=seq(0,1, by =0.05))[2],quantile(binding_sites_cl_samples$clp_rep3, prob=seq(0,1, by =0.05))[2],quantile(binding_sites_cl_samples$clp_rep4, prob=seq(0,1, by =0.05))[2]),per0.1 =c(quantile(binding_sites_cl_samples$clp_rep1, prob =seq(0,1, by =0.1))[2],quantile(binding_sites_cl_samples$clp_rep2, prob =seq(0,1, by =0.1))[2],quantile(binding_sites_cl_samples$clp_rep3, prob =seq(0,1, by =0.1))[2],quantile(binding_sites_cl_samples$clp_rep4, prob =seq(0,1, by =0.1))[2]),per0.2 =c(quantile(binding_sites_cl_samples$clp_rep1, prob =seq(0,1, by =0.1))[3],quantile(binding_sites_cl_samples$clp_rep2, prob =seq(0,1, by =0.1))[3],quantile(binding_sites_cl_samples$clp_rep3, prob =seq(0,1, by =0.1))[3],quantile(binding_sites_cl_samples$clp_rep4, prob =seq(0,1, by =0.1))[3]),variable =c("clp_rep1", "clp_rep2", "clp_rep3", "clp_rep4"))kable(quantiles_samples, caption="cutoff for the four samples, at 5%, 10% and 20% quantile")# make df of cl events per bs and plotdf_samples_cl_per_bs <-data.frame(clp_rep1 = binding_sites_cl_samples$clp_rep1, clp_rep2 = binding_sites_cl_samples$clp_rep2, clp_rep3 = binding_sites_cl_samples$clp_rep3, clp_rep4 = binding_sites_cl_samples$clp_rep4) %>% reshape2::melt()df_samples_cl_per_bs[df_samples_cl_per_bs$value>40,]$value <-40labs <-c("sample 1", "sample 2", "sample 3", "sample 4")names(labs) <-c("clp_rep1", "clp_rep2", "clp_rep3", "clp_rep4")plot_samples_cl_per_bs <-ggplot(df_samples_cl_per_bs, aes(x = value, group = variable)) +geom_bar(position ="dodge", width=1) +facet_wrap(~variable, labeller =labeller(variable = labs), scales="fixed") +geom_vline(data = quantiles_samples[1:4,], aes(xintercept = per0.05), color ="goldenrod") +geom_vline(data = quantiles_samples[1:4,], aes(xintercept = per0.1), color ="deepskyblue") +geom_vline(data = quantiles_samples[1:4,], aes(xintercept = per0.2), color ="darkblue") +# scale_colour_identity(name="percentiles",breaks=c(report_color[3], report_color[10],report_color[13]),# labels=c("5%", "10%", "20%"), guide = "legend")+xlab("crosslinks per binding site")+ylab("frequency")+theme_bw()+theme( axis.title.y =element_text(margin =margin(t =0, r =10, b =0, l =0)))+ggtitle("Distribution of number of cl events per binding site in the 4 samples")plot_samples_cl_per_bs ``````{r repro_Upset, echo=FALSE}bs <- binding_sites_cl_samplesbs$names <-1:length(bs)UpSet_List_cutoff20 =list(rep1 = bs[bs$clp_rep1>= quantiles_samples[1,3]]$names,rep2 = bs[bs$clp_rep2>= quantiles_samples[2,3]]$names,rep3 = bs[bs$clp_rep3>= quantiles_samples[3,3]]$names,rep4 = bs[bs$clp_rep4>= quantiles_samples[4,3]]$names)UpSetR::upset(UpSetR::fromList(UpSet_List_cutoff20), order.by =c("degree","freq"), nsets =4)grid.text("Overlap of binding sites between the 4 samples", x =0.65, y=0.95, gp=gpar(fontsize=16))``````{r filter_repro}########################## mark samplewise the lowest 20% percentile of cl per BS ########################binding_sites_cl_samples$cl1_overThresh <- binding_sites_cl_samples$clp_rep1 > quantiles_samples[1,3]binding_sites_cl_samples$cl2_overThresh <- binding_sites_cl_samples$clp_rep2 > quantiles_samples[2,3]binding_sites_cl_samples$cl3_overThresh <- binding_sites_cl_samples$clp_rep3 > quantiles_samples[3,3]binding_sites_cl_samples$cl4_overThresh <- binding_sites_cl_samples$clp_rep4 > quantiles_samples[4,3]# in how many samples is the number of cls per bs over the threshold?binding_sites_cl_samples$nsamples_overThresh <-rowSums(cbind(binding_sites_cl_samples$cl1_overThresh, binding_sites_cl_samples$cl2_overThresh, binding_sites_cl_samples$cl3_overThresh, binding_sites_cl_samples$cl4_overThresh))# keep only BS with cls over threshold in at least 3 of the 4 samplesbinding_sites_repro <- binding_sites_cl_samples[binding_sites_cl_samples$nsamples_overThresh>=3,] %>%makeGRangesFromDataFrame(., keep.extra.columns = T)#save(binding_sites_repro, file=paste0(output_path,"final_repro_BS_oe.RData" ))``````{r plot_repro_ecdf, echo=FALSE}########################## ecdf plot#########################ecdf_df <-data.frame(s1 = binding_sites_repro$clp_rep1, s2 = binding_sites_repro$clp_rep2, s3 = binding_sites_repro$clp_rep3, s4 = binding_sites_repro$clp_rep4) # calucalte coefficient of variation # cv = sd/meancv_samples_clpBS <-apply(ecdf_df[,1:4], 1, function(x) sd(x)/mean(x))ecdf_df$cv <- cv_samples_clpBS#standard error?meansem_samples_clpBS <-apply(ecdf_df[,1:4], 1, function(x) plotrix::std.error(x) /mean(x))ecdf_df$sem <- sem_samples_clpBSggplot(ecdf_df)+stat_ecdf(aes(x=cv, color="darkred"), geom ="step" )+stat_ecdf(aes(x=sem, color="orange"), geom="step")+ggtitle("ECDF plot of coefficient variation between the 4 samples")+scale_colour_manual(values=c("darkred", "orange"))+theme_bw()+xlab("")+scale_color_identity(name ="on x-achis",breaks =c("darkred", "orange"),labels =c("coeff of variation", "SE/mean"),guide ="legend")``````{r plot_repro_matrix, eval=FALSE, include=FALSE}######################### correlation matrix########################repro_scatter_df <- ecdf_df[,1:4] %>%mutate(s1 =log2(s1), s2 =log2(s2), s3=log2(s3), s4=log2(s4)) %>%mutate(s1 =case_when(s1==-Inf~0, T ~ s1),s2 =case_when(s2==-Inf~0, T ~ s2),s3 =case_when(s3==-Inf~0, T ~ s3),s4 =case_when(s4==-Inf~0, T ~ s4))scatter_fn <-function(data, mapping, ...){ p <-ggplot(data = data, mapping = mapping) +geom_point()+stat_density2d(aes(fill=..level..), geom="polygon") +coord_cartesian(xlim =c(0,12.5), ylim =c(0,12.5))+geom_abline(slope=1, colour ="darkgrey", linetype="dashed")+labs(fill="Point density") p}cor_fun <-function(data, mapping, method="pearson", ndp=2, sz=3, stars=T, ...){ x <- GGally::eval_data_col(data, mapping$x) y <- GGally::eval_data_col(data, mapping$y) corr <-cor.test(x, y, method=method) est <- corr$estimate lb.size <- sz*abs(est) if(stars){ stars <-c("***", "**", "*", "")[findInterval(corr$p.value, c(0, 0.001, 0.01, 0.05, 1))] lbl <-paste0(method, ": ", round(est, ndp), stars) }else{ lbl <-round(est, ndp) }ggplot(data=data, mapping=mapping) +annotate("text", label=lbl, x=6, y=6, size=lb.size,...)+theme(panel.grid =element_blank())}setwd(output_path)png("repro_matrix.png", width=6, height=5, res=300, units ="in")GGally::ggpairs(repro_scatter_df, upper =list(continuous = cor_fun), lower =list(continuous = scatter_fn), title ="Reproducability matrix - comparisons of 2 samples", columnLabels =c("sample 1", "sample 2", "sample 3", "sample 4"), xlab="Number of crosslink events per binding site [log2]", ylab="Number of crosslink events per binding site [log2]", legend=c(4,1))+theme_bw()dev.off()# setwd("/Users/melinaklostermann/Documents/PURA_oe_newpip/PURA_newpip/Report09-PURA_oe_Def_of_BS_plots")# pdf("repro_matrix.pdf", width=5, height=5)# GGally::ggpairs(repro_scatter_df, upper = list(continuous = cor_fun), lower = list(continuous = scatter_fn), title = "Reproducability matrix - comparisons of 2 samples", columnLabels = c("sample 1", "sample 2", "sample 3", "sample 4"), xlab="Number of crosslink events per binding site [log2]", ylab="Number of crosslink events per binding site [log2]", legend=c(4,1))+# theme_bw()# dev.off()```As can be seen in the reproducibility matrix the BS of all samples correlate well together. The ECDF of the coefficent variation also shows a good reporducibility as a ECDF of 1 is reached at around 0.7 (1 is proposed as a cutoff). The cv depends on the standard deviation. Probably it is more meaningfull to look at the standard error of the experiment> As SE = SD /(number of observations per sample)^-0.5 for 4 samples the SE is half of the SD.# Bound Genes and regions## assign bound gene type - many binding sites overlap with multiple genes (non codings!)- in IGV it look like rather the non-coding RNA than the protein-coding RNA is bound in that case- we therefore use a hieracry on gene types: lncRNA > miRNA > miscRNA > snRNA > snoRNA > protein-coding```{r}############################# annote binding sites with the bound ( = overlapping) gene################################# overlaps are resolved by:# 1. for two genes of different type by the type hieracy# 2. for to protein coding genes: in this chunk one protein coding gene is assigned at random in the next chunk the region hieracry is used to make this decision and the gene info is changed thenbinding_sites <- binding_sites_repro annotation_gene <- annotation[annotation$type=="gene"]# overlaps of bs with any geneol <-findOverlaps(annotation_gene, binding_sites, ignore.strand=F)# index binding sitesbinding_sites$idx <-1:NROW(binding_sites)# get all possible regions overlapping with crosslinked nucleotide in a temporary filebinding_sites_temp1 <- binding_sites[subjectHits(ol)]binding_sites_temp1 <-sortSeqlevels(binding_sites_temp1)elementMetadata(binding_sites_temp1) <-c(elementMetadata(binding_sites_temp1), elementMetadata(annotation_gene[queryHits(ol), c("gene_id", "gene_type", "gene_name")]))table(binding_sites_temp1$gene_type)# chose for each binding site a gene from the highest possible hierarchy# if two genes from the same type would be present the first one is chosen (randomly)binding_sites_temp2 <- binding_sites_temp1 %>%as.data.frame(.) %>%# this file contains several entries for 1 binding site overlapping with more than one genegroup_by(idx) %>%# the idx colum is an index of the binding sitesarrange(factor(gene_type, levels =c("lncRNA", "miRNA", "miscRNA", "snRNA", "snoRNA", "protein_coding" )), .by_group = T) %>%# arrange by hierarcy dplyr::slice(1) # choose randomly the first gene (the chosen gene is changed in the next chunk for protein coding genes)binding_sites_temp2 <-makeGRangesFromDataFrame(binding_sites_temp2, keep.extra.columns = T)table(binding_sites_temp2$gene_type)# non coding rnas "fake" region non codingbs_temp_non_cod <- binding_sites_temp2[binding_sites_temp2$gene_type!="protein_coding",] %>%makeGRangesFromDataFrame(., keep.extra.columns = T)bs_temp_non_cod$type <-"non_coding"```## Binding site region by hiercary approach- Overlap Bs with mutiple regions and choose one with highest priority- hieracry: "three_prime_UTR" > "five_prime_UTR" > "CDS" > "intron" - only done for protein-coding genes```{r}# classify only protein coding genes, regions only from bound genesanno_regions <- annotation[annotation$gene_type =="protein_coding"& annotation$type %in%c("three_prime_UTR", "five_prime_UTR", "CDS") ]#get intronsintrons <-intronsByTranscript(anno_txdb, use.names=T) %>%unlist(.)introns$type ="intron"anno_regions <-c(anno_regions, introns)#overlaps of binding sites with protein coding genesbinding_sites_temp_pc <- binding_sites_temp2[binding_sites_temp2$gene_type=="protein_coding"]ol_bs <-findOverlaps(anno_regions, binding_sites_temp_pc-2, ignore.strand =FALSE)# get all possible regions overlapping with bsbinding_sites_temp_pc2 <- binding_sites_temp_pc[subjectHits(ol_bs)]binding_sites_temp_pc2 <-sortSeqlevels(binding_sites_temp_pc2)# get possible region and transcript annotation of bsanno_bs <-elementMetadata(anno_regions[queryHits(ol_bs)]) %>%as.data.frame(.) %>% dplyr::select(c("type", "gene_id"))# and add to temp binding sitescolnames(anno_bs) <-c("type", "gene_id.from_region")elementMetadata(binding_sites_temp_pc2) <-c(elementMetadata(binding_sites_temp_pc[subjectHits(ol_bs)]), anno_bs)# choose type highest in hiearcybinding_sites_temp_pc3 <-as.data.frame(binding_sites_temp_pc2) %>%mutate(., gene_id = gene_id.from_region ) %>%# this throws out regions that overlap with non-coding RNAs, with this line no binding sites in introns!!!group_by(idx) %>%# specific id per binding sitearrange(factor(type, levels =c("three_prime_UTR", "five_prime_UTR", "CDS", "intron")), .by_group = T) %>% dplyr::slice(1) # arrange by region, select first, transcript info is ignored herebinding_sites_temp_pc3 <-makeGRangesFromDataFrame(binding_sites_temp_pc3, keep.extra.columns = T)sortSeqlevels(binding_sites_temp_pc3)# merge BS from non-coding RNAs and binding sites with specific regionsbinding_sites_with_regions <-c(binding_sites_temp_pc3, bs_temp_non_cod)# some protein coding genes not have a region, because no trustworth transcript is annotated in this position of the genebs_without_region <- binding_sites[-queryHits(findOverlaps(binding_sites, binding_sites_with_regions))]```# Output```{r}saveRDS(binding_sites_with_regions, paste0(output_path , "bs_oe.rds"))```# Session Info```{r}sessionInfo()```